Data preprocessing
We gonna load the datasets for every station, filter the columns and keep the values for every pollutant type. We’ll change the class of time to POSIXct, check for NAs and fill them using na.approx function
Station 1
s1 <- read.csv("station1.csv") # Loading the .csv file we previous create
s1 <- s1 %>% select(c(seq(2,38,4))) # Filtering the columns with pollutant values
names(s1) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25") #adding the column names
s1$time <- ymd_hms(s1$time) # changing the type of time
sapply(s1, function(x) sum(is.na(x))) # checking for NA
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 99 325 35 104 104 371 96 104 325
# Filling NA with na.approx function
s1[,2:10] <- sapply(s1[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = T), x))
sapply(s1, function(x) sum(is.na(x))) # zero NA
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 0 0 0 0 0 0 0 0 0
knitr::kable(head(s1,3)) # printing the first 3 rows
| 2020-01-01 00:00:00 |
4.213296 |
113.18510 |
5.422678 |
41.66263 |
64.42079 |
1045.5090 |
5.304624 |
14.83504 |
89.80502 |
| 2020-01-01 01:00:00 |
3.986767 |
49.54911 |
7.720096 |
41.01508 |
90.58797 |
977.6353 |
2.203252 |
32.31437 |
35.75103 |
| 2020-01-01 02:00:00 |
4.940188 |
64.78768 |
4.493460 |
50.22812 |
119.87520 |
1263.8940 |
3.149906 |
45.39985 |
35.52024 |
Station 2
s2 <- read.csv("station2.csv")
s2 <- s2 %>% select(c(seq(2,26,4)))
names(s2) <- c("time","SO2","O3","NO2","NOx","CO","NO")
s2$time <- ymd_hms(s2$time)
sapply(s2, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO NO
## 0 176 44 100 84 89 50
s2[,2:7] <- sapply(s2[,2:7], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 3
s3 <- read.csv("station3.csv")
s3 <- s3 %>% select(c(seq(2,30,4)))
names(s3) <- c("time","SO2","PM10","O3","NO2","NOx","CO","NO")
s3$time <- ymd_hms(s3$time)
sapply(s3, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO NO
## 0 723 128 124 227 227 709 253
s3[,2:8] <- sapply(s3[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 5
s5 <- read.csv("station5.csv")
s5 <- s5 %>% select(c(seq(2,38,4)))
names(s5) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s5$time <- ymd_hms(s5$time)
sapply(s5, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 199 182 97 64 64 98 183 95 195
s5[,2:10] <- sapply(s5[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 7
s7 <- read.csv("station7.csv")
s7 <- s7 %>% select(c(seq(2,38,4)))
names(s7) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","NO","PM25")
s7$time <- ymd_hms(s7$time)
sapply(s7, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 NO PM25
## 0 665 405 121 165 164 332 530 206 479
s7[,2:10] <- sapply(s7[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 8
s8 <- read.csv("station8.csv")
s8 <- s8 %>% select(c(seq(2,34,4)))
names(s8) <- c("time","SO2","PM10","O3","NO2","NOx","C6H6","NO","PM25")
s8$time <- ymd_hms(s8$time)
sapply(s8, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx C6H6 NO PM25
## 0 202 138 76 83 83 759 89 141
s8[,2:9] <- sapply(s8[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 9
s9 <- read.csv("station9.csv")
s9 <- s9 %>% select(c(seq(2,30,4)))
names(s9) <- c("time","SO2","O3","NO2","NOx","CO","C6H6","NO")
s9$time <- ymd_hms(s9$time)
sapply(s9, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO C6H6 NO
## 0 180 45 108 108 626 938 323
s9[,2:8] <- sapply(s9[,2:8], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 10
s10 <- read.csv("station10.csv")
s10 <- s10 %>% select(c(seq(2,38,4)))
names(s10) <- c("time","SO2","PM10","O3","NO2","NOx","CO","C6H6","PM25","NO")
s10$time <- ymd_hms(s10$time)
sapply(s10, function(x) sum(is.na(x)))
## time SO2 PM10 O3 NO2 NOx CO C6H6 PM25 NO
## 0 305 1057 107 177 177 224 311 1095 426
s10[,2:10] <- sapply(s10[,2:10], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Station 11
s11 <- read.csv("station11.csv")
s11 <- s11 %>% select(c(seq(2,34,4)))
names(s11) <- c("time","SO2","O3","NO2","NOx","CO","NO","PM10","PM25")
s11$time <- ymd_hms(s11$time)
sapply(s11, function(x) sum(is.na(x)))
## time SO2 O3 NO2 NOx CO NO PM10 PM25
## 0 275 113 121 121 83 271 864 864
s11[,2:9] <- sapply(s11[,2:9], function(x) ifelse(is.na(x), na.approx(x, na.rm = TRUE), x))
Changing the time
Next step is gonna be to change the time. Our dataset is in hours so we gonna create two new datasets for every station in which we gonna store mean value for each day and for each month. First we’ll transform the datasets to xts type and then create .d and .m datasets for days and months.
Station 1
s1.xts <- xts(s1, order.by = s1$time) # creating the xts type of s1 dataset
s1.d <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "days"), mean) #finding the mean for every day
s1.m <- period.apply(s1.xts[,2:10] , endpoints(s1.xts, "months"), mean) #finding the mean for every month
s1.d <- as.data.frame(s1.d) # changing for xts to data.frame type
s1.m <- as.data.frame(s1.m) # changing for xts to data.frame type
Station 2
s2.xts <- xts(s2, order.by = s2$time)
s2.d <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "days"), mean)
s2.m <- period.apply(s2.xts[,2:7] , endpoints(s2.xts, "months"), mean)
s2.d <- as.data.frame(s2.d)
s2.m <- as.array.default(s2.m)
# Creating a time column to join the datasets later. Only for the stations with missing days
s2.d$time <- as.POSIXct(row.names(s2.d),format = "%Y-%m-%d")
Station 3
s3.xts <- xts(s3, order.by = s3$time)
s3.d <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "days"), mean)
s3.m <- period.apply(s3.xts[,2:8] , endpoints(s3.xts, "months"), mean)
s3.d <- as.data.frame(s3.d)
s3.m <- as.data.frame(s3.m)
Station 5
s5.xts <- xts(s5, order.by = s5$time)
s5.d <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "days"), mean)
s5.m <- period.apply(s5.xts[,2:10] , endpoints(s5.xts, "months"), mean)
s5.d <- as.data.frame(s5.d)
s5.m <- as.data.frame(s5.m)
Station 7
s7.xts <- xts(s7, order.by = s7$time)
s7.d <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "days"), mean)
s7.m <- period.apply(s7.xts[,2:10] , endpoints(s7.xts, "months"), mean)
s7.d <- as.data.frame(s7.d)
s7.m <- as.data.frame(s7.m)
s7.d$time <- as.POSIXct(row.names(s7.d),format = "%Y-%m-%d")
Station 8
s8.xts <- xts(s8, order.by = s8$time)
s8.d <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "days"), mean)
s8.m <- period.apply(s8.xts[,2:9] , endpoints(s8.xts, "months"), mean)
s8.d <- as.data.frame(s8.d)
s8.m <- as.data.frame(s8.m)
s8.d$time <- as.POSIXct(row.names(s8.d),format = "%Y-%m-%d")
Station 9
s9.xts <- xts(s9, order.by = s9$time)
s9.d <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "days"), mean)
s9.m <- period.apply(s9.xts[,2:8] , endpoints(s9.xts, "months"), mean)
s9.d <- as.data.frame(s9.d)
s9.m <- as.data.frame(s9.m)
Station 10
s10.xts <- xts(s10, order.by = s10$time)
s10.d <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "days"), mean)
s10.m <- period.apply(s10.xts[,2:10] , endpoints(s10.xts, "months"), mean)
s10.d <- as.data.frame(s10.d)
s10.m <- as.data.frame(s10.m)
Station 11
s11.xts <- xts(s11, order.by = s11$time)
s11.d <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "days"), mean)
s11.m <- period.apply(s11.xts[,2:9] , endpoints(s11.xts, "months"), mean)
s11.d <- as.data.frame(s11.d)
s11.m <- as.data.frame(s11.m)
deleting the datasets we dont gonna use
rm(s1,s1.xts,s10, s10.xts, s11, s11.xts,s2,s2.xts,s3, s3.xts, s5, s5.xts,
s7, s7.xts, s8, s8.xts,s9, s9.xts, s10, s10.xts, s11, s11.xts)
PM10 Analysis
Creating the dataframe for pm10 with day columns
# Creating a data.frame with all pm10 measurement values. Stations 1,3,5,10 and 11 have no missing days
pm10 <- data.frame("station1"=s1.d$PM10,"station3"=s3.d$PM10,"station5"=s5.d$PM10,
"station10"=s10.d$PM10,"station11"=s11.d$PM10)
# Creating a time column to help us join previous dataframe with station 7 and 8.
pm10$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
# Full join pm10 and station7 by time column
pm10 <- full_join(pm10,s7.d[,c("time","PM10")],by="time")
# Full join pm10 and station8 by time column
pm10 <- full_join(pm10,s8.d[,c("time","PM10")],by="time")
# Changing the column names, changing the order and filling NAs at station 7 and 8 with mean values
pm10 <- pm10 %>% rename(station7=PM10.x, station8=PM10.y) %>%
select(time,station1,station3,station5,station7,station8,station10,station11) %>%
mutate(station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
# Adding a mean column with mean value of PM10 for every day
pm10$mean <- rowMeans(pm10[,2:8])
knitr::kable(head(pm10,3)) # printing the first 3 columns
| 2020-01-01 |
42.89521 |
18.62904 |
34.34300 |
20.03441 |
26.282481 |
3.927083 |
26.23751 |
24.62125 |
| 2020-01-02 |
20.59653 |
10.59705 |
23.11284 |
17.06729 |
10.038982 |
3.721084 |
30.37896 |
16.50182 |
| 2020-01-03 |
10.83952 |
11.74851 |
13.33836 |
16.47911 |
6.818125 |
3.705833 |
30.29477 |
13.31775 |
Creating the dataframe for pm10 with monthly columns
# adding the values to dataframe
pm10.m <- data.frame(s1.m$PM10,s3.m$PM10,s5.m$PM10,s7.m$PM10,
s8.m$PM10,s10.m$PM10,s11.m$PM10)
# Creating a column with mean values for every month
pm10.m$mean <- rowMeans(pm10.m)
# Adding row and column names
rownames(pm10.m) <- c("January","February","March","April","May","June")
colnames(pm10.m) <- c(stations[c(1,3:6,8:9)],"mean")
knitr::kable(pm10.m, digits = 4) #printing the table
| January |
36.3907 |
25.0343 |
30.1276 |
25.8627 |
16.7807 |
7.6372 |
28.1558 |
24.2841 |
| February |
30.6833 |
27.2613 |
28.6184 |
26.1846 |
21.0343 |
11.2538 |
28.1471 |
24.7404 |
| March |
30.3524 |
27.4642 |
33.2767 |
26.4542 |
25.0162 |
20.2937 |
29.5287 |
27.4837 |
| April |
25.1905 |
23.7127 |
27.7513 |
21.3120 |
25.6438 |
19.0282 |
25.3662 |
24.0007 |
| May |
35.7941 |
35.3375 |
36.1732 |
35.4699 |
35.6575 |
25.1839 |
32.6934 |
33.7585 |
| June |
25.9754 |
26.6133 |
34.1944 |
23.4294 |
32.6121 |
18.3832 |
31.7686 |
27.5681 |
Summary statistics of PM10 for each station
colnames(pm10) <- c("time",stations[c(1,3:6,8:9)],"mean")
knitr::kable(do.call(cbind, lapply(pm10[2:9], summary)))
| Min. |
10.83952 |
10.00244 |
11.00115 |
9.065349 |
4.911398 |
3.240060 |
12.13500 |
12.47557 |
| 1st Qu. |
21.24346 |
19.09453 |
22.15374 |
18.909459 |
17.613646 |
9.620417 |
21.06490 |
20.33870 |
| Median |
26.44153 |
25.34259 |
28.60686 |
24.125281 |
22.768948 |
14.252989 |
26.51567 |
24.25839 |
| Mean |
30.78019 |
27.58017 |
31.70585 |
26.173728 |
26.282481 |
17.039248 |
29.27912 |
26.97726 |
| 3rd Qu. |
37.40693 |
32.06226 |
36.45506 |
29.643809 |
30.963913 |
19.605729 |
33.07865 |
29.77486 |
| Max. |
97.42616 |
94.58720 |
96.06582 |
89.734498 |
112.569130 |
82.874167 |
82.96542 |
90.03731 |
Data preperation
pm10.t <- as.data.frame(t(as.matrix(pm10))) # transpose the pm10 matrix
# changing row and column names
rownames(pm10.t) <- c("time",stations[c(1,3:6,8:9)],"mean")
colnames(pm10.t) <- pm10$time
pm10.bar2 <- pm10.t[2:8,]
pm10.bar2$row.names <- rownames(pm10.bar2)
# changing the data from wide format to a single column of data
pm10.bar2<-melt(pm10.bar2,id=c("row.names"))
# Changing the type of class
pm10.bar2$value <- as.numeric(pm10.bar2$value)
pm10.bar2$variable <- ymd(pm10.bar2$variable)
pm10.bar2$row.names <- as.factor(pm10.bar2$row.names)
Boxplot for each station
ggplot(data = pm10.bar2, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))

PM10 time series.
The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. We can see there are some peaks as a result of dust episodes. We can see that COVID19 situation did not affect the values of PM10.
ggplot(pm10, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
labs(y="PM10 value", x="Month",title="January to June 2020 Time series for PM10") +
geom_hline(aes(yintercept = 40, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 20, colour="WHO_air_quality_guideline"), linetype=1) +
scale_color_manual(name = "Line notation", values = c(Lockdown="red",EU_limit_value="magenta",
WHO_air_quality_guideline="green",Flight_ban="orange")) +
theme_classic() +
annotate("text",x=pm10$time[140],y = 95,label="Dust Episode", size =4, fontface="bold", color="brown" )

Time Series plot for every station
ggplot(pm10.bar2, aes(x=variable,y=as.numeric(value))) + geom_area(aes(color = row.names, fill = row.names),
alpha = 0.7) + theme_bw() +
labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
labs(fill='Station') + labs(color="Station")

ggplot(pm10.bar2, aes(x=variable,y=as.numeric(value))) + geom_line(aes(color = row.names),
alpha = 0.7, size=0.7) + theme_classic() +
labs(y="PM10 value", x="Month",title="January to June 2020 PM10 Time series for every station") +
labs(color="Station")

PM10 daily difference
COVID19 situation doesn’t change the stability of PM10 and the differences seems to be almost the same before and after lockdown. The highest differences can be spotted the first 2 weeks after lockdown.
pm10.ts <- ts(pm10[9], start=c(2020,1) , frequency = 365)
pm10.diff <- diff(pm10.ts)
pm10.dif <- data.frame("time"=pm10$time,"diff"=c(0,pm10.diff))
ggplot(pm10.dif, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("PM10 difference") +
ggtitle("PM10 daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot of monthly values
pm10.mt <- as.data.frame(t(as.matrix(pm10.m))) ##transpose of month matrix
rownames(pm10.mt) <- c(stations[c(1,3:6,8:9)],"mean") #adding row names
heatmaply(pm10.mt, grid_color = "white", dendrogram = "none", main = "PM10 per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(pm10.mt,2))
Barplot of PM10 for every month and visual contribution of each station
pm10.bar <- pm10.mt[1:7,] # row filtering
pm10.bar$row.names <- rownames(pm10.bar) # adding column with row names
# changing the data from wide format to a single column of data
pm10.bar<-melt(pm10.bar,id=c("row.names"))
# creating the plot
ggplot(pm10.bar,aes(x=variable,y=value/7,fill=row.names))+geom_bar(position="stack", stat="identity") +
labs(title = "PM10 per month and station contribution", y = "PM10 value", x = "Months") +
labs(fill = "Stations")

Hierarchical Analysis
I choose to create three clusters. The clusters seems to be very reasonal. The green cluster includes four of five traffic stations. The red includes only Ayia_Marina_Xyliaroy station, the only Rural Station of our dataset and the blue includes Paphos traffic station and Zygi industrial station.
pm10.t2 <- as.data.frame(t(as.matrix(pm10[,2:8])))
row.names(pm10.t2)<- stations[c(1,3:6,8:9)]
pm10.t2$station <- stations[c(1,3:6,8:9)]
pm10.scaled <- as.data.frame(scale(pm10.t2[,1:182])) # scaling
dist_pm10 <- dist(pm10.scaled, method = 'euclidean') # calculating the distance
hclust_pm10 <- hclust(dist_pm10, method = 'complete') # creating the HC
# plotting th dendrogram
plot(hclust_pm10,labels=pm10.t2$station)
rect.hclust(hclust_pm10 , k = 3,border = 2:6) # adding the 3 cluster boxes

Elbow rule.
We can see that a good choice would be to choose k=2
fviz_nbclust(pm10.scaled, kmeans, method = "wss", k.max=6)

PCA
pm10_pca <- prcomp(pm10.t2[,-183], center = TRUE, scale. = TRUE)
summary(pm10_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 9.0772 5.8317 4.9680 4.17065 3.4946 3.36293 4.183e-15
## Proportion of Variance 0.4527 0.1869 0.1356 0.09557 0.0671 0.06214 0.000e+00
## Cumulative Proportion 0.4527 0.6396 0.7752 0.87076 0.9379 1.00000 1.000e+00
Cluster plot with PCA
This biplot for the first two principal components, accounting for 64% of the total variance
sub_grp <- cutree(hclust_pm10, k = 3)
fviz_cluster(list(data = pm10.scaled, cluster = sub_grp))

PCA 3d plot
This 3d plot for the first three principal components, accounting for 77% of the total variance
pca3d(pm10_pca, group=sub_grp, radius = 4, show.labels=T)
## [1] 1.4917757 0.7329541 0.5801611
## Creating new device
NOx Analysis
Creating the dataframe for NOx with day columns
nox <- data.frame("station1"=s1.d$NOx,"station3"=s3.d$NOx ,"station5"=s5.d$NOx ,
"station9"=s9.d$NOx,"station10"=s10.d$NOx,"station11"=s11.d$NOx)
nox$time <- as.POSIXct(row.names(s1.d),format = "%Y-%m-%d")
nox <- full_join(nox,s2.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s7.d[,c("time","NOx")],by="time")
nox <- full_join(nox,s8.d[,c("time","NOx")],by="time")
nox <- nox %>% rename(station2=NOx.x, station7=NOx.y, station8=NOx) %>%
select(time,station1,station2,station3,station5,station7,station8,
station9,station10,station11) %>%
mutate(station2=ifelse(is.na(station2),mean(station2, na.rm=T),station2),
station7=ifelse(is.na(station7),mean(station7, na.rm=T),station7),
station8=ifelse(is.na(station8),mean(station8, na.rm=T),station8))
nox$mean <- rowMeans(nox[,2:10])
knitr::kable(head(nox,3))
| 2020-01-01 |
82.36372 |
45.79606 |
40.62136 |
58.09139 |
25.14944 |
12.34307 |
8.669477 |
1.704961 |
35.21159 |
34.43901 |
| 2020-01-02 |
56.73409 |
40.99054 |
55.56007 |
45.45797 |
26.65660 |
13.26133 |
10.280462 |
2.418991 |
39.59478 |
32.32831 |
| 2020-01-03 |
42.54235 |
49.56395 |
86.81541 |
65.22078 |
36.50002 |
12.76792 |
14.621379 |
2.824170 |
39.55709 |
38.93479 |
Creating the dataframe for pm10 with monthly columns
nox.m <- data.frame(s1.m$NOx,s2.m$NOx,s3.m$NOx,
s5.m$NOx,s7.m$NOx,s8.m$NOx,
s9.m$NOx,s10.m$NOx,s11.m$NOx)
nox.m$Mean <- rowMeans(nox.m)
row.names(nox.m)<-c("January","February","March","April","May","June")
colnames(nox.m)<-c(stations,"Mean")
knitr::kable(nox.m, digits = 2)
| January |
91.52 |
52.55 |
65.59 |
59.05 |
30.98 |
13.16 |
12.87 |
2.92 |
42.54 |
41.24 |
| February |
63.56 |
42.07 |
57.72 |
44.19 |
24.97 |
15.39 |
14.25 |
3.67 |
30.46 |
32.92 |
| March |
33.44 |
21.28 |
34.36 |
29.16 |
15.23 |
13.05 |
10.73 |
3.22 |
16.98 |
19.72 |
| April |
15.85 |
8.54 |
13.42 |
11.89 |
6.16 |
7.63 |
7.94 |
2.38 |
9.91 |
9.30 |
| May |
25.51 |
12.79 |
23.90 |
20.43 |
16.51 |
9.49 |
10.92 |
2.28 |
14.14 |
15.11 |
| June |
23.31 |
14.33 |
27.21 |
20.37 |
10.07 |
15.36 |
18.16 |
1.82 |
15.09 |
16.19 |
colnames(nox) <- c("time",stations,"mean")
knitr::kable(do.call(cbind, lapply(nox[2:11], summary)))
| Min. |
4.381602 |
1.701600 |
4.374026 |
3.925617 |
1.096979 |
3.354127 |
4.109603 |
0.8867552 |
4.751423 |
4.282998 |
| 1st Qu. |
18.641329 |
9.213398 |
17.151341 |
14.350127 |
6.056446 |
8.697741 |
8.450774 |
2.0224395 |
10.675313 |
12.574391 |
| Median |
28.274242 |
16.176603 |
30.812245 |
22.375224 |
15.017890 |
12.035261 |
10.922486 |
2.4228941 |
15.611197 |
17.676432 |
| Mean |
42.206705 |
25.325023 |
36.994232 |
30.840662 |
17.387691 |
12.343074 |
12.448582 |
2.7189122 |
21.518156 |
22.420337 |
| 3rd Qu. |
54.810168 |
39.055830 |
52.007920 |
44.548574 |
25.405847 |
15.280685 |
15.560341 |
3.0809064 |
24.351356 |
31.687493 |
| Max. |
158.532276 |
100.709014 |
139.149394 |
126.046744 |
75.349972 |
27.850411 |
28.655146 |
8.9129858 |
113.762156 |
68.516004 |
data preperation
nox.t <- as.data.frame(t(as.matrix(nox)))
# changing row and column names
colnames(nox.t) <- nox$time
nox.bar <- nox.t[2:10,]
nox.bar$row.names <- rownames(nox.bar)
# changing the data from wide format to a single column of data
nox.bar<-melt(nox.bar,id=c("row.names"))
nox.bar$value <- as.numeric(nox.bar$value)
nox.bar$variable <- ymd(nox.bar$variable)
nox.bar$row.names <- as.factor(nox.bar$row.names)
Boxplot for each station
ggplot(data = nox.bar, aes(x = row.names, y = value, fill=row.names)) +geom_boxplot() +
theme(legend.position = "none") + labs(title="PM10 boxplot for all stations", x="Stations",y="PM10") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))

NOx time series
The green line is the air quality guideline from WHO and the pink line is the established EU limit value. The dashed orange line is the date Cyprus banned the flights and the red dashed line is the date the lockdown started. Some peaks are a result of dust episodes. We can see that COVID19 situation affect the values of NOx.
ggplot(nox, aes(x=time,y=mean)) + geom_line() + geom_smooth() +
geom_vline(aes(xintercept=as.numeric(pm10$time[84]),colour="Lockdown"),linetype=2, size=1) +
geom_vline(aes(xintercept=as.numeric(pm10$time[81]),colour="Flight_ban"),linetype=2, size=1) +
geom_hline(aes(yintercept = 30, colour="EU_limit_value"), linetype=1) +
geom_hline(aes(yintercept = 40, colour="WHO_air_quality_guideline"), linetype=1) +
labs(y="NOx value", x="Month",title="January to June 2020 Time series for NOx") +
scale_color_manual(name = "statistics", values = c(Lockdown="red",EU_limit_value="magenta",
Flight_ban="orange",WHO_air_quality_guideline="green")) +
theme_classic() +
annotate("text",x=pm10$time[140],y = 37,label="Dust Episode", size =4, fontface="bold", color="brown" )

Time Series plot for every station
ggplot(nox.bar, aes(x=variable,y=as.numeric(value))) + geom_area(aes(fill = row.names, colour=row.names),
alpha = 0.3) + theme_bw() +
labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
labs(fill="Station") + labs(colour="Station")

ggplot(nox.bar, aes(x=variable,y=as.numeric(value))) + geom_line(aes(colour=row.names), alpha=0.7, size=0.7) + theme_bw() +
labs(y="NOx value", x="Month",title="January to June 2020 NOx Time series for every station") +
labs(colour="Station")

NOx daily difference
After COVID19 situation we can see that the daily difference is low and NOx seems to be more stable.
nox.ts <- ts(nox[11], start=c(2020,1) , frequency = 365)
nox.diff <- diff(nox.ts)
nox.diff <- data.frame("time"=nox$time,"diff"=c(0,nox.diff))
ggplot(nox.diff, aes(x=time,y=diff))+geom_line(color=I("red")) + xlab("Months")+ ylab("NOx difference") +
ggtitle("NOx daily difference") + theme_classic() + geom_hline(yintercept = 0, linetype="dashed")

Heatmap plot for NOx monthly values for each station
nox.mt <- as.data.frame(t(as.matrix(nox.m)))
heatmaply(nox.mt, grid_color = "white", dendrogram = "none", main = "NOx per month for all Stations and mean values",xlab = "Months",ylab="Stations",cellnote = round(nox.mt,2))
Barplot of NOx for every month and visual contribution of each station
nox.bar2 <- nox.mt[1:9,]
nox.bar2$row.names <- rownames(nox.bar2) # adding column with row names
# changing the data from wide format to a single column of data
nox.bar2<-melt(nox.bar2,id=c("row.names"))
# creating the plot
ggplot(nox.bar2,aes(x=variable,y=value/9,fill=row.names))+geom_bar(position="stack", stat="identity") +
labs(title = "NOx per month and station contribution", y = "NOx value", x = "Months") +
labs(fill = "Stations")

Elbow rule for NOx
fviz_nbclust(nox.scaled, kmeans, method = "wss", k.max=8)

PCA
nox_pca <- prcomp(nox.t2[,-183], center = TRUE, scale. = TRUE)
summary(nox_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 11.1594 4.3701 3.69591 2.69370 2.4766 2.2210 1.96366
## Proportion of Variance 0.6842 0.1049 0.07505 0.03987 0.0337 0.0271 0.02119
## Cumulative Proportion 0.6842 0.7892 0.86423 0.90410 0.9378 0.9649 0.98609
## PC8 PC9
## Standard deviation 1.59114 2.641e-15
## Proportion of Variance 0.01391 0.000e+00
## Cumulative Proportion 1.00000 1.000e+00
Cluster plot with PCA
This biplot for the first two principal components, accounting for 79% of the total variance
sub_grp2 <- cutree(hclust_nox, k = 3)
fviz_cluster(list(data = nox.scaled, cluster = sub_grp2))

PCA 3d plot
This 3d plot for the first three principal components, accounting for 86% of the total variance
pca3d(nox_pca, group=sub_grp2, radius = 4, show.labels=T)
## [1] 1.5326401 0.4950112 0.4708179
Loading the covid database
covid <- read.csv("owid-covid-data.csv")
covid <- covid %>% filter(location=="Cyprus") # filtering the Cyprus data
covid$date <- ymd(covid$date) # changing the date class
Joining the covid and pm10 dataset
pm10$time <- ymd(pm10$time) # changing the time class
# left join the two dataset using time and date column
pm10.cov <- merge(pm10, covid, by.x="time",by.y="date", all.x =T )
# removing columns with almost zero variance
pm10.cov <- pm10.cov %>% select(-nearZeroVar(pm10.cov))
# filtering the columns we need
pm10.cov2 <- pm10.cov %>% select(c(1,9:13))
# filling NA with 0 for the dates before COVID19
pm10.cov2[,3:6] <- sapply(pm10.cov2[,3:6], function(x) ifelse(is.na(x),0,x))
Correlation matrix
# creating a function to setup the geom_smooth graph of ggpair plot
my_fn <- function(data, mapping, method="loess", ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point() +
geom_smooth(method=method, color="black", ...)
p
}
ggpairs(pm10.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn),
upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

creating the corrplot
We can see that there is no clear correlation between PM10 and COVID19. The correlation values are close to zero so there is almost zero impact of coronavirus to values of PM10.
cor.pm10 <- cor(pm10.cov2[,2:6])
corrplot.mixed(cor.pm10)

Join the nox and covid database
nox$time <- ymd(nox$time)
# left join between nox and covid dataset
nox.cov <- merge(nox, covid, by.x="time",by.y="date", all.x =T )
nox.cov <- nox.cov %>% select(-nearZeroVar(nox.cov))
nox.cov2 <- nox.cov %>% select(c(1,11:15))
nox.cov2[,3:6] <- sapply(nox.cov2[,3:6], function(x) ifelse(is.na(x),0,x))
Correlation matrix between mean value of NOx and COVID19 values
ggpairs(nox.cov2[,2:6], aes(color="red", alpha=0.3), lower=list(continuous=my_fn),
upper = list(continuous = wrap("cor", size = 3))) + theme_bw()

We can see that we have a negative correlation between NOx value and total cases of coronavirus. That negative correlation means that when total cases increase the NOx mean value descreases. So there is COVID19 impact in the NOx values.
cor.nox <- cor(nox.cov2[,2:6])
corrplot.mixed(cor.nox)
